Trabalho final de R do grupo

Participantes: Adriana, Bruno, Rafael e Vinicius

Carregando as Packages necessarias

library(magrittr)         # quando der problema com o "%>%"
library(dplyr)            # selecao e filtro de dados
library(geosphere)        # localizacao geoespacial
library(lubridate)        # datas, funcoes hour, month, wday
library(plotly)           # plot dos graficos
library(knitr)            # usada pelo plotly
library(dummies)          # cria colunas binarias para variaveis categoricas
library(scales)           # normaliza dados rescalando para float de 0 a 1
library(randomForest)     # cria rede neural para criar regressao de tempo de viagem
library(tidyverse)
library(yaml)
source('preprocessing.R')
source('mapa_calor_ny.R')

Carregando o Dataset (compactado)

read.csv(gzfile("./data_source/train.csv.gz")) %>% 
  as.data.frame() -> train

Ver os primeiros 3 registros

head(train, 3)
##          id vendor_id     pickup_datetime    dropoff_datetime
## 1 id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2 id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3 id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48
##   passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1               1        -73.98215        40.76794         -73.96463
## 2               1        -73.98042        40.73856         -73.99948
## 3               1        -73.97903        40.76394         -74.00533
##   dropoff_latitude store_and_fwd_flag trip_duration
## 1         40.76560                  N           455
## 2         40.73115                  N           663
## 3         40.71009                  N          2124

Verificar se existem nulos

sum(is.na(train))
## [1] 0

Summary do Dataset

summary(train)
##          id            vendor_id                pickup_datetime   
##  id0000001:      1   Min.   :1.000   2016-01-12 18:48:44:      5  
##  id0000003:      1   1st Qu.:1.000   2016-02-09 21:03:38:      5  
##  id0000005:      1   Median :2.000   2016-03-04 08:07:34:      5  
##  id0000008:      1   Mean   :1.535   2016-04-05 18:55:21:      5  
##  id0000009:      1   3rd Qu.:2.000   2016-05-07 13:18:07:      5  
##  id0000011:      1   Max.   :2.000   2016-06-10 23:17:17:      5  
##  (Other)  :1458638                   (Other)            :1458614  
##             dropoff_datetime   passenger_count pickup_longitude 
##  2016-02-19 19:25:04:      5   Min.   :0.000   Min.   :-121.93  
##  2016-05-16 19:40:28:      5   1st Qu.:1.000   1st Qu.: -73.99  
##  2016-01-07 08:04:32:      4   Median :1.000   Median : -73.98  
##  2016-01-08 12:43:38:      4   Mean   :1.665   Mean   : -73.97  
##  2016-01-08 13:00:41:      4   3rd Qu.:2.000   3rd Qu.: -73.97  
##  2016-01-09 15:59:42:      4   Max.   :9.000   Max.   : -61.34  
##  (Other)            :1458618                                    
##  pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
##  Min.   :34.36   Min.   :-121.93   Min.   :32.18    N:1450599         
##  1st Qu.:40.74   1st Qu.: -73.99   1st Qu.:40.74    Y:   8045         
##  Median :40.75   Median : -73.98   Median :40.75                      
##  Mean   :40.75   Mean   : -73.97   Mean   :40.75                      
##  3rd Qu.:40.77   3rd Qu.: -73.96   3rd Qu.:40.77                      
##  Max.   :51.88   Max.   : -61.34   Max.   :43.92                      
##                                                                       
##  trip_duration    
##  Min.   :      1  
##  1st Qu.:    397  
##  Median :    662  
##  Mean   :    959  
##  3rd Qu.:   1075  
##  Max.   :3526282  
## 

Selecionando 10.000 registros aleatorios para analise -> jogando em um novo dataset que será utilizado daqui em diante

motivo: utilizar o dataset completo trava o RStudio em nossos equipamentos

set.seed(20)
# Criando uma sequencia de 1 ate a quantidade de total de linhas
linhas.idx <- seq_len(nrow(train))
# Obtendo aleatoriamente 10000 amostras de linhas do dataset
linhas.sample <- sample(linhas.idx, 5000)
# Amostra aleatoria com 10000 dados
df <- train[linhas.sample, ]
head(df, 10)
##                id vendor_id     pickup_datetime    dropoff_datetime
## 1279992 id1222043         1 2016-04-23 14:30:31 2016-04-23 14:46:12
## 1121016 id1532438         1 2016-06-29 13:56:47 2016-06-29 14:37:48
## 406908  id2622202         2 2016-06-29 19:06:05 2016-06-29 19:19:51
## 771860  id1738519         1 2016-02-16 22:46:58 2016-02-16 23:32:02
## 1404535 id1217189         2 2016-05-03 16:29:38 2016-05-03 16:32:45
## 1429984 id3391016         2 2016-05-08 20:46:41 2016-05-08 20:52:50
## 133222  id2918068         2 2016-05-07 21:25:28 2016-05-07 21:44:40
## 103198  id1453370         2 2016-02-13 15:58:02 2016-02-13 16:12:25
## 477841  id0952614         1 2016-03-24 20:06:00 2016-03-24 20:29:05
## 539804  id2150157         2 2016-06-14 10:46:55 2016-06-14 10:57:38
##         passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1279992               1        -74.00172        40.73539         -73.99198
## 1121016               2        -73.86301        40.76910         -73.98652
## 406908                5        -73.94592        40.78599         -73.97225
## 771860                1        -73.87315        40.77406         -73.68898
## 1404535               1        -73.94961        40.77686         -73.94212
## 1429984               1        -73.98979        40.74349         -73.99090
## 133222                2        -73.95290        40.77622         -73.99262
## 103198                2        -73.97441        40.74239         -73.99603
## 477841                1        -73.86289        40.76910         -73.95470
## 539804                1        -73.95792        40.76106         -73.94463
##         dropoff_latitude store_and_fwd_flag trip_duration
## 1279992         40.74020                  N           941
## 1121016         40.75035                  N          2461
## 406908          40.79433                  N           826
## 771860          41.04483                  N          2704
## 1404535         40.78674                  N           187
## 1429984         40.75052                  N           369
## 133222          40.75843                  N          1152
## 103198          40.73256                  N           863
## 477841          40.77395                  N          1385
## 539804          40.80746                  N           643

Preparacao dos dados

Regiao de saida

df$bairro_saida = mapply(define_bairro, df$pickup_longitude, df$pickup_latitude)
df$bairro_chegada = mapply(define_bairro, df$dropoff_longitude, df$dropoff_latitude)

Adiciona distancia Euclidiana calculada a partir das coordenadas (arquivo Preprocessing.R)

#Distancia em KM
df$dist_euclidiana = dist_eucl(df)

Adiciona distancia de Manhattan calculada a partir das coordenadas (arquivo Preprocessing.R)

df$dist_manhattan = dist_manh(df)
df$velocidade = df$dist_manhattan / df$trip_duration

Prepara data e hora da partida

Com isso e possivel pegar horario de pico e dia da semana

df$pickup_hour <- hour(df$pickup_datetime)
df$pickup_month <- month(df$pickup_datetime)
df$pickup_weekdays <- wday(df$pickup_datetime)

Transforma as variaveis de tempo em senoides

df$sen_hour <- sin(df$pickup_hour / 3.4)
df$sen_month <- sin(df$pickup_month / 1.7)
df$sen_week <- sin(df$pickup_weekdays)

Exemplo conversao do periodo de 24h em senoide

plot( sin(seq(from = 1, to = 24/3.4, length.out = 70 )), type = 'o' )

Limpeza de corridas zeradas e limpa corridas muito longas

df %>%
  filter(df$dist_manhattan > 0.5) -> df
df %>%
  filter(df$trip_duration < 10000) -> df

Analises descritivas

Divisao das regioes que separamos em NY:

Regioes NY

Regioes NY

Quantidade de viagens por regiao de saida e chegada

df %>%
  group_by(bairro_saida) %>%
  count() -> data_plot
plot1 = plot_ly(data= data_plot, x= ~bairro_saida, y= ~n, type = 'bar')

df %>%
  group_by(bairro_chegada) %>%
  count() -> data_plot
plot2 = plot_ly(data= data_plot, x= ~bairro_chegada, y= ~n, type = 'bar')

subplot(plot1, plot2, shareY = T)

Media da velocidade das viagens por regiao de saida e chegada

df %>%
  group_by(bairro_saida) %>%
  summarize(velocidade_media = mean(velocidade),n()) -> data_plot
plot1 = plot_ly(data= data_plot, x= ~bairro_saida, y= ~velocidade_media, type = 'bar')

df %>%
  group_by(bairro_chegada) %>%
  summarize(velocidade_media = mean(velocidade),n()) -> data_plot
plot2 = plot_ly(data= data_plot, x= ~bairro_chegada, y= ~velocidade_media, type = 'bar')

subplot(plot1, plot2, shareY = T)

Plotar correlacao passageiros tempo

p1 = plot_ly(data= df, x= ~passenger_count, y= ~trip_duration, type = 'scatter', mode = 'markers') 
p2 = plot_ly(data= df, x= ~dist_manhattan, y= ~trip_duration, type = 'scatter', mode = 'markers') %>% 
  layout(title="Correlacao Num. Passageiros vs. Tempo   |   Correlacao Distancia vs. Tempo")
subplot(p1, p2)

Media da velocidade das viagens por hora e dia da semana

df %>%
  group_by(pickup_hour) %>%
  summarize(velocidade_media = mean(velocidade),n()) -> data_plot1
plot1 = plot_ly(data= data_plot1, x= ~pickup_hour, y= ~velocidade_media, type = 'scatter', mode='lines')
df %>%
  group_by(pickup_weekdays) %>%
  summarize(velocidade_media = mean(velocidade),n()) -> data_plot2
plot2 = plot_ly(data= data_plot2, x= ~pickup_weekdays, y= ~velocidade_media, type = 'scatter', mode='lines') %>% 
  layout(title="Horas       |        Dias da Semana") 

subplot(plot1, plot2, shareY = T)

Quantidade de viagens por hora e dia da semana

df %>%
  group_by(pickup_hour) %>%
  count() -> data_plot1
plot1 = plot_ly(data= data_plot1, x= ~pickup_hour, y= ~n, type = 'bar')
df %>%
  group_by(pickup_weekdays) %>%
  count() -> data_plot2
plot2 = plot_ly(data= data_plot2, x= ~pickup_weekdays, y= ~n, type = 'bar') %>% 
  layout(title="Horas       |        Dias da Semana") 

subplot(plot1, plot2)

Plota mapa de calor de New York com ponto de partida da viagem

heat_map_taxi(train, "pickup")
## OGR data source with driver: ESRI Shapefile 
## Source: "D:\GITHUB\analise_taxi\data_source\mapa_ny", layer: "geo_export_8661594b-4f67-485f-8af1-84a4bd06054d"
## with 5 features
## It has 4 fields

Plota mapa de calor de New York com ponto de chegada da viagem

heat_map_taxi(train, "dropoff")
## OGR data source with driver: ESRI Shapefile 
## Source: "D:\GITHUB\analise_taxi\data_source\mapa_ny", layer: "geo_export_8661594b-4f67-485f-8af1-84a4bd06054d"
## with 5 features
## It has 4 fields

Executar uma analise de clusters (duracao da viagem) atraves do “kmeans”

kmeans_data <- df[, c("trip_duration", "dist_euclidiana")]
boxplot(kmeans_data[, c("trip_duration")], las=1, xlab="trip_duration")

boxplot(kmeans_data[, c("dist_euclidiana")], las=1, xlab=c("dist_euclidiana"))

normalized <-(kmeans_data-min(kmeans_data))/(max(kmeans_data)-min(kmeans_data))
clusters <- kmeans(normalized, centers = 3)
plot(normalized, col=clusters$cluster, pch=21, cex=1)

Calculando os intervalos de 15 em 15 minutos na coluna hour_quarter,

df %>%
  mutate(pickup_time_in_minutes = minute(pickup_datetime) + hour(pickup_datetime) * 60) %>% 
  mutate(hour_quarter = pickup_time_in_minutes %/% 15) -> df
head(df, 3)
##          id vendor_id     pickup_datetime    dropoff_datetime
## 1 id1222043         1 2016-04-23 14:30:31 2016-04-23 14:46:12
## 2 id1532438         1 2016-06-29 13:56:47 2016-06-29 14:37:48
## 3 id2622202         2 2016-06-29 19:06:05 2016-06-29 19:19:51
##   passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1               1        -74.00172        40.73539         -73.99198
## 2               2        -73.86301        40.76910         -73.98652
## 3               5        -73.94592        40.78599         -73.97225
##   dropoff_latitude store_and_fwd_flag trip_duration bairro_saida
## 1         40.74020                  N           941            5
## 2         40.75035                  N          2461            8
## 3         40.79433                  N           826            4
##   bairro_chegada dist_euclidiana dist_manhattan  velocidade pickup_hour
## 1              5       0.9800723       1.356218 0.001441252          14
## 2              5      10.6208499      12.499100 0.005078870          13
## 3              3       2.4052090       3.147032 0.003809967          19
##   pickup_month pickup_weekdays   sen_hour  sen_month   sen_week
## 1            4               7 -0.8282931  0.7094035  0.6569866
## 2            6               4 -0.6302978 -0.3781704 -0.7568025
## 3            6               4 -0.6403470 -0.3781704 -0.7568025
##   pickup_time_in_minutes hour_quarter
## 1                    870           58
## 2                    836           55
## 3                   1146           76
df %>%
  group_by(hour_quarter) %>% 
  summarise(count = n()) -> hour_quarter_freq
plot(hour_quarter_freq, type = "o", main="Grafico de linha temporal por quartos de hora", xlab="Quarto de hora", ylab="Numero de viagens")

df %>%
  group_by(pickup_month) %>% 
  summarise(count = n()) -> month_freq
plot(month_freq, type = "o", main="Grafico de linha temporal mensal", xlab="Mês", ylab="Número de viagens")

df %>%
  group_by(pickup_weekdays) %>% 
  summarise(count = n()) -> weekday_freq
plot(weekday_freq, type = "o", main="Gráfico de linha temporal por dia da semana", xlab="Dia da semana", ylab="Número de viagens")

Normalizar dados para o modelo

Primeiro, criando variaveis dummies para dia da semana e hora

bairro_dummy = dummy(df$bairro_chegada, sep='_')
df = data.frame(cbind(df, bairro_dummy))

Agora normalizando com Min Max Scaler as variaveis: distancia, trip_duration e passenger_count

df$dist_manhattan = rescale(df$dist_manhattan)
df$trip_duration = rescale(df$trip_duration)
df$passenger_count = rescale(df$trip_duration)